###############################################################
###############################################################
#### Stefan Müller, Samuel Brazys, and Alexander Dukalskis
#### Replication Scripts for: 
#### Discourse Wars and 'Mask Diplomacy': China's Global Image Management in Times of Crisis 
#### Political Research Exchange, 2024
#### Link to paper: https://doi.org/10.1080/2474736X.2024.2309178
###############################################################
###############################################################

## Note: check the 000_README.pdf file on Harvard Dataverse for 
## the full replication instructions and information on all code scripts.
## Link to Dataverse repository: https://doi.org/10.7910/DVN/KRXMXJ
## Please contact the authors if you have any questions or suggestions. 
## Note: due to copyright restrictions some of the files cannot be shared publicly.
## However, we provide all replication scripts and intermediate objects to reproduce
## the plots and tables included in the paper and Supporting Information.

## This file performs the Latent Sentiment Score (LSS) analysis for 
## the English and translated articles.

# load packages
library(dplyr)
library(ggplot2)
library(quanteda)
library(quanteda.textstats)
library(LSX)
library(xtable)
library(cowplot)
library(rio)
library(texreg)
library(readr)

# load custom ggplot2 scheme
source("function_theme_base.R")

# Read the corpus of English and translated articles.
# note: the data cannot be shared due to privacy concerns
dat_full <- readRDS("data_dontshare/data_texts_full.rds")

# English-language subset, used for the descriptive counts below
dat_en <- dat_full |>
    filter(language == "en")

# number of distinct domains among the English texts
n_distinct(dat_en$domain)

# number of distinct countries among the English texts
n_distinct(dat_en$country)

# drop rows whose text is missing
dat_full_clean_valid <- dat_full |>
    filter(!is.na(text))

# Validation step: how often does the term "virus" relate to Covid-19?
# A random sample of matching English sentences is drawn for manual annotation.

# English sentences whose text contains virus or Virus
dat_virus <- dat_full_clean_valid |>
    filter(grepl("virus|Virus", text),
           language == "en")

# fixed seed so that the same 200 sentences are drawn on every run
set.seed(235)
dat_virus_sample <- dat_virus |>
    sample_n(size = 200) |>
    dplyr::select(pattern, text, docname, domain, country, type, language)

# export for manual annotation (disabled)
# rio::export(dat_virus_sample, "data_handcode/data_virus_sample.xlsx")

# restrict the rows with non-missing text to English-language texts
dat_full_clean_valid_en <- dat_full_clean_valid |>
    filter(language == "en")

# build a quanteda corpus; the document text is taken from the `text` column
corp_china_mentioned <- dat_full_clean_valid_en |>
    corpus(text_field = "text")

# sanity check: "covid-19" is kept as a single token by the tokenizer
tokens("covid-19")
# Tokens consisting of 1 document.
# text1 :
#     [1] "covid-19"



# Tokenise the English corpus for the LSS analysis: punctuation and numbers
# are dropped, and English stopwords are removed with padding = TRUE.
toks_sent <- tokens_remove(
    tokens(corp_china_mentioned,
           remove_punct = TRUE,
           remove_numbers = TRUE),
    pattern = stopwords("en"),
    padding = TRUE
)


# Document-feature matrix, following the pre-processing recommended in
# Watanabe (2021) and the LSX documentation: min_nchar = 4 is applied in
# dfm_keep(), then the matrix is trimmed with min_termfreq = 10 and
# min_docfreq = 5.
dfmt_sent <- toks_sent |>
    dfm() |>
    dfm_keep(valuetype = "regex", min_nchar = 4) |>
    dfm_trim(min_termfreq = 10, min_docfreq = 5)


# frequency table of the features in the trimmed matrix
freqs <- textstat_frequency(dfmt_sent)


# inspect the top features that start with covid*
topfeatures(dfm_select(dfmt_sent, pattern = "covid*"))

# Read the sentiment seed words; the `positive` and `negative` columns are
# converted into a quanteda dictionary below.
dat_seedwords <- read.csv("data_sentiment_seed.csv")

# rows with a non-missing entry in the respective column
pos <- dat_seedwords |> filter(!is.na(positive))
neg <- dat_seedwords |> filter(!is.na(negative))

# dictionary with two keys: positive and negative seed terms
data_dictionary_china <- dictionary(
    list(positive = pos[["positive"]],
         negative = neg[["negative"]])
)

# print the dictionary for inspection
data_dictionary_china

# tabulate the `covid` document variable to check that all sentences
# contain one of the covid-related terms
table(dfmt_sent$covid)

# Fit the LSS model on the English document-feature matrix, using the
# dictionary entries (converted with as.seedwords()) as seed words.
lss <- textmodel_lss(
    x = dfmt_sent,
    seeds = as.seedwords(data_dictionary_china),
    k = 300,
    cache = FALSE
)

# store the fitted model for later use
saveRDS(lss, file = "data_dontshare/lss_model.rds")


# to skip training, load the stored model instead
# lss <- readRDS("data_dontshare/lss_model.rds")

# Corpus of all rows with non-missing text (not restricted to English);
# the document text is taken from the `text` column.
corp_china_mentioned_all <- dat_full_clean_valid |>
    corpus(text_field = "text")


# Tokenise with the same settings as for the English corpus: punctuation and
# numbers are dropped, English stopwords are removed with padding = TRUE.
toks_sent_all <- tokens_remove(
    tokens(corp_china_mentioned_all,
           remove_punct = TRUE,
           remove_numbers = TRUE),
    pattern = stopwords("en"),
    padding = TRUE
)


# Document-feature matrix with the same feature selection and trimming
# thresholds as the matrix used for training.
dfmt_sent_all <- toks_sent_all |>
    dfm() |>
    dfm_keep(valuetype = "regex", min_nchar = 4) |>
    dfm_trim(min_termfreq = 10, min_docfreq = 5)

# frequency table of the features in the full matrix
freqs_all <- textstat_frequency(dfmt_sent_all)

# Score the sentences of the full corpus (English and non-English texts)
# with the fitted LSS model; se_fit = TRUE also stores the standard error.
pred <- predict(lss,
                newdata = dfmt_sent_all,
                se_fit = TRUE) |>
    as.data.frame()


head(pred)

# sentence text together with the document variables of the full corpus
dat_docvars <- data.frame(text = as.character(corp_china_mentioned_all),
                          docvars(corp_china_mentioned_all))

# keep the row names of the predictions as a doc-id column
pred[["doc_id_merge"]] <- rownames(pred)

# column-bind predictions and document-level data
# (bind_cols() combines the rows by position, not by doc-id)
pred_sentences <- bind_cols(pred, dat_docvars)

nrow(pred_sentences)

# save predictions for each sentence (disabled)
# saveRDS(pred_sentences, "data_dontshare/data_texts_lss.rds")

# from here on, the stored predictions replace the object created above
pred_sentences <- readRDS("data_dontshare/data_texts_lss.rds")

# term frequencies, indexed by the names of the polarity scores
frequency <- lss$frequency[names(lss$beta)] # fix for < v1.1.4

# one row per term: the word, its polarity score (beta) and its frequency
dat_neg_pos <- data.frame(
    word = names(lss$beta),
    beta = lss$beta,
    freq = frequency,
    stringsAsFactors = FALSE
)

# export the word-level LSS polarity scores
write_csv(dat_neg_pos, "data_scores_lss_words.csv")

# word list read from the positive China dictionary file
dat_words <- rio::import("data_dictionary_china_positive.xlsx")

# top features of the Whitepaper file, restricted to the selected terms
dat_words_whitepaper <- filter(
    rio::import("data_topfeatures_fighting_covid.xlsx"),
    feature %in% c(
        "prevention",
        "help",
        "fight",
        "commitment",
        "coordination",
        "assistance",
        "support",
        "solidarity")
)

# plain vectors of both word lists
words_china <- dat_words$word


words_whitepaper <- dat_words_whitepaper$feature

# flag the terms in the polarity data frame that appear in either word list
dat_neg_pos <- dat_neg_pos |>
    mutate(whitepaper_word = word %in% words_whitepaper,
           china_word = word %in% words_china)

# keep only terms that occur more than 300 times
dat_neg_pos_subset <- dat_neg_pos %>% 
    filter(freq > 300)

# standardise the polarity scores (z-scores: subtract the mean, divide by
# the standard deviation).
# The scores are stored in the `beta` column; dat_neg_pos has no `coef`
# column, so referring to `coef` would pick up the stats::coef() function
# instead of data and make mutate() fail.
dat_neg_pos <- dat_neg_pos %>% 
    mutate(coef_stand = (beta - mean(beta)) / sd(beta))

# regress the standardised score on the China-word indicator to check
# whether terms from the China word list have different polarity scores
lm1 <- lm(coef_stand ~ china_word, data = dat_neg_pos)

# print the regression table to the console
screenreg(lm1)

# authors' note on the result: as expected, more positive!

# Extract the most positive and the most negative sentences
nrow(pred_sentences)

# unique combinations of predicted score and sentence text
pred_sentences_unique <- unique(select(pred_sentences, fit, text))

# the 30 sentences with the highest scores
pred_sent_pos <- pred_sentences_unique |>
    top_n(30, wt = fit) |>
    mutate(class = "Positive")

# the 30 sentences with the lowest scores
pred_sent_neg <- pred_sentences_unique |>
    top_n(-30, wt = fit) |>
    mutate(class = "Negative")

# stack both sets, sort from highest to lowest score and use
# table-ready column names
pred_sent_top <- bind_rows(pred_sent_neg, pred_sent_pos) |>
    arrange(desc(fit)) |>
    rename(`Text Score` = fit,
           Text = text,
           Category = class)

# random sample of 200 sentences with scores between -0.05 and 0.05;
# the seed is fixed so that the same sentences are drawn on every run
set.seed(5)
pred_sent_neutral <- pred_sentences_unique |>
    filter(between(fit, -0.05, 0.05)) |>
    sample_n(size = 200) |>
    mutate(class = "Neutral")

# Table A03: write the extreme sentences to an HTML table
x <- xtable(pred_sent_top)
print(x,
      type = "html",
      include.rownames = FALSE,
      file = "tab_a03.html")
